### Replication code for Taylor C. Boas, F. Daniel Hidalgo, and Guillermo Toral. "Competence versus Priorities: Negative Electoral Responses to Education Quality in Brazil"
### This file cleans the dataset from the online survey experiment
### R version, platform, and package versions reported at the end of the file
### June 19, 2020

# PREPARE THE ENVIRONMENT -------------------------------------------------
# Set Working Directory to wherever this file is located.

# The directory where this file is located must also have a "figures" and a "tables" subdirectory
# Clean the environment
rm(list = ls())
# Load packages (make sure they are previously installed)
library(tidyverse); library(multcomp); library(xtable); library(texreg); library(Matching); library(ebal); library(cobalt); library(lmtest); library(codebook)

# IMPORT AND CLEAN DATA -------------------------------------------------
# Import data as downloaded from Qualtrics, after anonymizing it. 
d <- read_csv("data/online_survey_anonymized.csv")
# Remove column information provided by Qualtrics
d <- d[c(3:nrow(d)),]
### Commented out below is the code we used to anonymize the dataset by removing IP address and geolocation of respondents. Before doing that, we removed observations with duplicate IP addresses
# Import data as downloaded from Qualtrics
# q <- read_csv("data/online_survey.csv")
# qt <- read_csv("data/online_survey_text.csv")
# # Save column information provided by Qualtrics, for appending to the anonymized data
# header <- q[c(1:2),]
# header_t <- qt[c(1:2),]
# # Remove column information provided by Qualtrics
# q <- q[c(3:nrow(q)),]
# qt <- qt[c(3:nrow(qt)),]
# # Remove duplicated IP addresses
# q <- q[!duplicated(q$IPAddress),]
# qt <- qt[!duplicated(qt$IPAddress),]
# # Remove variables that could identify respondents
# q <- q %>%
  # dplyr::select(-c("IPAddress", "LocationLatitude", "LocationLongitude"))
# header <- header %>%
  # dplyr::select(-c("IPAddress", "LocationLatitude", "LocationLongitude"))
# qt <- qt %>%
  # dplyr::select(-c("IPAddress", "LocationLatitude", "LocationLongitude"))
# header_t <- header_t %>%
  # dplyr::select(-c("IPAddress", "LocationLatitude", "LocationLongitude"))
# # Append the Qualtrics header to the anonymized dataset
# q <- rbind(header, q)
# qt <- rbind(header_t, qt)
# write_csv(q, "data/online_survey_anonymized.csv")
# write_csv(qt, "data/online_survey_text_anonymized.csv")

# Treatment group indicators
d$treatment <- ifelse(d$Group=="Treatment",1,0)
d$control <- ifelse(d$Group=="Control",1,0)
# Indicators for whether treated respondents receive positive or negative information (i.e., were IDEB targets met in the municipality where they reported to live?)
d$treatment_pos <- ifelse(d$Group=="Treatment" & d$ideb==1,1,0) 
d$treatment_neg <- ifelse(d$Group=="Treatment" & d$ideb==0,1,0)
# Priority given to different policy areas (1= highest priority, 5 = lowest priorirty)
d$priority_education <- as.numeric(d$p7_1)
d$priority_education_low <- ifelse(d$priority_education>median(d$priority_education,na.rm=T),1,0)
d$priority_education_very_low <- ifelse(d$priority_education>median(d$priority_education,na.rm=T)+1,1,0)
d$priority_healthcare <- as.numeric(d$p7_2)
d$priority_economy <- as.numeric(d$p7_5)
d$priority_socialassistance <- as.numeric(d$p7_6)
d$priority_security <- as.numeric(d$p7_7)
# Indicator for whether they had heard of IDEB before
d$had_heard_of_ideb <- ifelse(d$p8==1,1,0)
# Perceptions of investment and improvement by policy area
# Respondents were asked first about education and then, in random order, about the other policy areas
d$mayor_invested_education <- as.numeric(d$p11_1)
d$mayor_invested_healthcare <- ifelse(d$Area1=="SAÚDE",as.numeric(d$p11_2),ifelse(d$Area2=="SAÚDE",as.numeric(d$p11_3),ifelse(d$Area3=="SAÚDE",as.numeric(d$p11_4),NA)))
d$mayor_invested_socialassistance <- ifelse(d$Area1=="ASSISTÊNCIA SOCIAL",as.numeric(d$p11_2),ifelse(d$Area2=="ASSISTÊNCIA SOCIAL",as.numeric(d$p11_3),ifelse(d$Area3=="ASSISTÊNCIA SOCIAL",as.numeric(d$p11_4),NA)))
d$mayor_invested_security <- ifelse(d$Area1=="SEGURANÇA",as.numeric(d$p11_2),ifelse(d$Area2=="SEGURANÇA",as.numeric(d$p11_3),ifelse(d$Area3=="SEGURANÇA",as.numeric(d$p11_4),NA)))
d$mayor_improved_education <- as.numeric(d$p12_1)
d$mayor_improved_healthcare <- ifelse(d$Area1=="SAÚDE",as.numeric(d$p12_2),ifelse(d$Area2=="SAÚDE",as.numeric(d$p12_3),ifelse(d$Area3=="SAÚDE",as.numeric(d$p12_4),NA)))
d$mayor_improved_socialassistance <- ifelse(d$Area1=="ASSISTÊNCIA SOCIAL",as.numeric(d$p12_2),ifelse(d$Area2=="ASSISTÊNCIA SOCIAL",as.numeric(d$p12_3),ifelse(d$Area3=="ASSISTÊNCIA SOCIAL",as.numeric(d$p12_4),NA)))
d$mayor_improved_security <- ifelse(d$Area1=="SEGURANÇA",as.numeric(d$p12_2),ifelse(d$Area2=="SEGURANÇA",as.numeric(d$p12_3),ifelse(d$Area3=="SEGURANÇA",as.numeric(d$p12_4),NA)))
# Revert the scale so that higher values mean more agreement
d$mayor_invested_education <- dplyr::recode(d$mayor_invested_education, '1'=4, '2'=3, '3'=2, '4'=1)
d$mayor_invested_healthcare <- dplyr::recode(d$mayor_invested_healthcare, '1'=4, '2'=3, '3'=2, '4'=1)
d$mayor_invested_socialassistance <- dplyr::recode(d$mayor_invested_socialassistance, '1'=4, '2'=3, '3'=2, '4'=1)
d$mayor_invested_security <- dplyr::recode(d$mayor_invested_security, '1'=4, '2'=3, '3'=2, '4'=1)
d$mayor_improved_education <- dplyr::recode(d$mayor_improved_education, '1'=4, '2'=3, '3'=2, '4'=1)
d$mayor_improved_healthcare <- dplyr::recode(d$mayor_improved_healthcare, '1'=4, '2'=3, '3'=2, '4'=1)
d$mayor_improved_socialassistance <- dplyr::recode(d$mayor_improved_socialassistance, '1'=4, '2'=3, '3'=2, '4'=1)
d$mayor_improved_security <- dplyr::recode(d$mayor_improved_security, '1'=4, '2'=3, '3'=2, '4'=1)
# Demographics
d$brazilian <- ifelse(d$p2==1,1,0)
d$lives_in_brazil <- ifelse(d$p3==1,1,0)
d$age <- as.numeric(d$p4)+16
d$age <- ifelse(d$age<18,NA,d$age)
d$female <- ifelse(as.numeric(d$p14)==2,1,0)
d$education <- as.numeric(d$p15)-10
d$education_lessthanprimary <- ifelse(d$education<4,1,0)
d$education_lessthanprimary <- ifelse(d$education<3,1,0)
d$education_primary <- ifelse(d$education>2 & d$education<5,1,0)
d$education_secondary <- ifelse(d$education %in% c(5,6,8),1,0) 
d$education_higher <- ifelse(d$education %in% c(7,9,10),1,0)
# Racial categories
d$cor <- as.numeric(d$p13)
d$race_white <- ifelse(d$cor==1,1,0)
d$race_brown <- ifelse(d$cor==3,1,0)
d$race_other <- ifelse(d$cor!=1 & d$cor!=3,1,0)
# State and region dummies
d$estado <- as.factor(d$p5)
for(level in unique(d$estado)){
  d[paste("uf", level, sep = "_")] <- ifelse(d$estado == level, 1, 0)
}
d$region_north <- ifelse(d$p5 %in% c(28,30,31,41,48,49),1,0)
d$region_centerwest <- ifelse(d$p5 %in% c(34,36,39,40),1,0)
d$region_southeast <- ifelse(d$p5 %in% c(35,38,46,53),1,0)
d$region_south <- ifelse(d$p5 %in% c(45,50,51),1,0)
# Post-stratification weights based on education (relative to education levels, as per the census, in the municipalities within the RDD bandwidth (see Appendix D of the paper))
d$weights_census_rdd <- ifelse(d$education_lessthanprimary,0.589/mean(d$education_lessthanprimary,na.rm=T),
                               ifelse(d$education_primary,0.15/mean(d$education_primary,na.rm=T),
                                      ifelse(d$education_secondary,0.206/mean(d$education_secondary,na.rm=T),
                                             0.056/mean(d$education_higher,na.rm=T))))
# Remove invalid respondents (i.e., those who are not in Brazil, are not Brazilian, are minors, did not finish the survey, or live in a municipality with no IDEB score and target)
d <- d[d$ip_country=="Brazil" & d$brazilian==1 & d$lives_in_brazil==1 & !is.na(d$age) & !is.na(d$p15) & !is.na(d$ideb),]
# Remove columns we will not use
dd <- d %>%
  dplyr::select(ideb, treatment, control, treatment_pos, treatment_neg, # Variables related to treatment
                priority_education, priority_education_low, priority_education_very_low, priority_healthcare, priority_economy, priority_socialassistance, priority_security, # Variables related to how they rank policy areas
                mayor_invested_education, mayor_invested_healthcare, mayor_invested_socialassistance, mayor_invested_security, mayor_improved_education, mayor_improved_healthcare, mayor_improved_socialassistance, mayor_improved_security, # Outcome variables
                age, female, education, race_white, race_brown, race_other, region_north, region_centerwest, region_southeast, region_south, had_heard_of_ideb, weights_census_rdd) # Controls and other variables

# EXPORT CLEAN DATASET ----------------------------------------------------------

write_csv(dd, "data/online_survey_cleaned.csv")

# GENERATE CODEBOOK -------------------------------------------------------

var_label(dd) <- list(
  ideb = "Indicator for whether the respondent's self-reported municipality met its IDEB target",
  treatment = "Indicator for whether the respondent is in the treatment group and thus presented information about their municipality's performance in IDEB",
  control = "Indicator for whether the respondent is in the control group and thus not presented information about their municipality's performance in IDEB",
  treatment_pos = "Indicator for whether the respondent is in the treatment group and their municipality met its IDEB target",
  treatment_neg = "Indicator for whether the respondent is in the treatment group and their municipality did not meet its IDEB target",
  had_heard_of_ideb = "Indicator for whether the respondents reports having heard of IDEB before",
  priority_education = "Respondent's ranking given to education as a priority area",
  priority_education_low = "Indicator for whether the respondent places at least two policy areas above education",
  priority_education_very_low = "Indicator for whether the respondent places at least three policy areas above education",
  priority_healthcare = "Respondent's ranking given to healthcare as a priority area",
  priority_economy = "Respondent's ranking given to the economy as a priority area",
  priority_socialassistance = "Respondent's ranking given to social assistance as a priority area",
  priority_security = "Respondent's ranking given to security as a priority area",
  mayor_invested_education = "Resopndent's level of agreement with the following statement -- The mayor invested a lot of money in education",
  mayor_invested_healthcare = "Resopndent's level of agreement with the following statement -- The mayor invested a lot of money in healthcare",
  mayor_invested_socialassistance = "Resopndent's level of agreement with the following statement -- The mayor invested a lot of money in social assistance",
  mayor_invested_security = "Resopndent's level of agreement with the following statement -- The mayor invested a lot of money in security",
  mayor_improved_education = "Resopndent's level of agreement with the following statement -- The mayor improved the quality of education",
  mayor_improved_healthcare = "Resopndent's level of agreement with the following statement -- The mayor improved the quality of healthcare",
  mayor_improved_socialassistance = "Resopndent's level of agreement with the following statement -- The mayor improved the quality of social assistance",
  mayor_improved_security = "Resopndent's level of agreement with the following statement -- The mayor improved the quality of security",
  age = "Respondent's self-reported age",
  female = "Indicator for whether the respondent identifies as a woman",
  education = "Respondent's self-reported level of education",
  race_white = "Indicator for whether the respondent identifies as white",
  race_brown = "Indicator for whether the respondent identifies as brown",
  race_other = "Indicator for whether the respondent identifies as other racial category",
  region_north = "Indicator for whether the respondent reports living in a state in the northern region of Brazil",
  region_centerwest = "Indicator for whether the respondent reports living in a state in the central-western region of Brazil",
  region_southeast = "Indicator for whether the respondent reports living in a state in the southeastern region of Brazil",
  region_south = "Indicator for whether the respondent reports living in a state in the southern region of Brazil",
  weights_census_rdd = "Respondent's post-stratification weight based on their education level"
  )

val_labels(dd$mayor_invested_education) <- c("I agree completely" = 4, "I agree partially" = 3, "I disagree partially" = 2, "I disagree completely" = 1)
val_labels(dd$mayor_invested_healthcare) <- c("I agree completely" = 4, "I agree partially" = 3, "I disagree partially" = 2, "I disagree completely" = 1)
val_labels(dd$mayor_invested_socialassistance) <- c("I agree completely" = 4, "I agree partially" = 3, "I disagree partially" = 2, "I disagree completely" = 1)
val_labels(dd$mayor_invested_security) <- c("I agree completely" = 4, "I agree partially" = 3, "I disagree partially" = 2, "I disagree completely" = 1)
val_labels(dd$mayor_improved_education) <- c("I agree completely" = 4, "I agree partially" = 3, "I disagree partially" = 2, "I disagree completely" = 1)
val_labels(dd$mayor_improved_healthcare) <- c("I agree completely" = 4, "I agree partially" = 3, "I disagree partially" = 2, "I disagree completely" = 1)
val_labels(dd$mayor_improved_socialassistance) <- c("I agree completely" = 4, "I agree partially" = 3, "I disagree partially" = 2, "I disagree completely" = 1)
val_labels(dd$mayor_improved_security) <- c("I agree completely" = 4, "I agree partially" = 3, "I disagree partially" = 2, "I disagree completely" = 1)
val_labels(dd$education) <- c("No education" = 1, "Incomplete basic education" = 2, "Complete basic education" = 3, "Incomplete high school" = 4, "Complete high school" = 5, "Incomplete technical degree" = 6, "Complete technical degree" = 7, "Incomplete tertiary degree" = 8, "Complete tertiary degree" = 9, "Masters, doctorate, or other graduate degree" = 10 )

metadata(dd)$name <- "online_survey_cleaned.csv"
metadata(dd)$description <- "Codebook for the dataset used for all online survey experiment analyses in Competence versus Priorities: Negative Electoral Responses to Education Quality in Brazil"

# codebook(dd) # This produces a more detailed codebook with summary statistics and histograms
codebook:::label_browser_static(dd)

# NOTES -- R version, platform, and loaded packages -------------------------
# sessionInfo(package = NULL)
# R version 3.6.3 (2020-02-29)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Catalina 10.15.3
# 
# Matrix products: default
# BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
# LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
# 
# Random number generation:
#   RNG:     Mersenne-Twister 
# Normal:  Inversion 
# Sample:  Rounding 
# 
# locale:
#   [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# 
# attached base packages:
#   [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
#   [1] cobalt_4.1.0      ebal_0.1-6        Matching_4.9-7    multcomp_1.4-13   TH.data_1.0-10   
# [6] MASS_7.3-51.5     mvtnorm_1.1-0     Hmisc_4.4-0       lattice_0.20-40   timelineS_0.1.1  
# [11] lfe_2.8-5         Matrix_1.2-18     xtable_1.8-4      texreg_1.37.1     rdrobust_0.99.7  
# [16] rdd_0.57          Formula_1.2-3     AER_1.2-9         survival_3.1-11   car_3.0-7        
# [21] carData_3.0-3     lmtest_0.9-37     zoo_1.8-7         sandwich_2.5-1    shiny_1.4.0.2    
# [26] readxl_1.3.1      codebook_0.9.2    electionsBR_0.3.1 forcats_0.5.0     stringr_1.4.0    
# [31] dplyr_1.0.0       purrr_0.3.3       readr_1.3.1       tidyr_1.0.2       tibble_3.0.0     
# [36] ggplot2_3.3.0     tidyverse_1.3.0  
# 
# loaded via a namespace (and not attached):
#   [1] colorspace_1.4-1    ellipsis_0.3.0      rio_0.5.16          htmlTable_1.13.3   
# [5] base64enc_0.1-3     fs_1.4.1            rstudioapi_0.11     listenv_0.8.0      
# [9] farver_2.0.3        DT_0.13             fansi_0.4.1         lubridate_1.7.4    
# [13] xml2_1.3.0          codetools_0.2-16    splines_3.6.3       knitr_1.28         
# [17] jsonlite_1.6.1      broom_0.5.5         cluster_2.1.0       dbplyr_1.4.2       
# [21] png_0.1-7           compiler_3.6.3      httr_1.4.1          backports_1.1.5    
# [25] assertthat_0.2.1    fastmap_1.0.1       cli_2.0.2           later_1.0.0        
# [29] acepack_1.4.1       htmltools_0.4.0     tools_3.6.3         gtable_0.3.0       
# [33] glue_1.3.2          Rcpp_1.0.4          cellranger_1.1.0    vctrs_0.3.1        
# [37] nlme_3.1-145        crosstalk_1.1.0.1   xfun_0.12           globals_0.12.5     
# [41] openxlsx_4.1.4      rvest_0.3.5         mime_0.9            miniUI_0.1.1.1     
# [45] lifecycle_0.2.0     future_1.17.0       scales_1.1.0        hms_0.5.3          
# [49] promises_1.1.0      parallel_3.6.3      RColorBrewer_1.1-2  yaml_2.2.1         
# [53] curl_4.3            gridExtra_2.3       rpart_4.1-15        labelled_2.5.0     
# [57] latticeExtra_0.6-29 stringi_1.4.6       highr_0.8           checkmate_2.0.0    
# [61] rmdpartials_0.5.8   zip_2.0.4           repr_1.1.0          rlang_0.4.6        
# [65] pkgconfig_2.0.3     evaluate_0.14       htmlwidgets_1.5.1   labeling_0.3       
# [69] tidyselect_1.1.0    magrittr_1.5        R6_2.4.1            generics_0.0.2     
# [73] DBI_1.1.0           pillar_1.4.3        haven_2.3.1         foreign_0.8-76     
# [77] withr_2.1.2         nnet_7.3-13         abind_1.4-5         modelr_0.1.6       
# [81] crayon_1.3.4        utf8_1.1.4          rmarkdown_2.1       jpeg_0.1-8.1       
# [85] grid_3.6.3          data.table_1.12.8   reprex_0.3.0        digest_0.6.25      
# [89] httpuv_1.5.2        munsell_0.5.0       skimr_2.1.1  